import csv
import json

from docx import Document
import re


def extract_disease_name(text):
    """Extract the disease name from a numbered heading.

    Everything after the first closing parenthesis is taken as the name,
    e.g. '(二)咳嗽' -> '咳嗽' and '（二）咳嗽' -> '咳嗽'. Both the ASCII ')'
    and the full-width '）' are recognised. If the text contains no closing
    parenthesis, the whole text is returned.

    Args:
        text: Heading text, optionally prefixed with a parenthesised ordinal.

    Returns:
        The disease name with surrounding whitespace removed.
    """
    # Match either parenthesis form: Word documents mix ASCII and full-width
    # punctuation, and matching only '）' left ASCII-numbered headings as-is.
    match = re.search(r'[）)]\s*(.*)', text)
    return match.group(1).strip() if match else text.strip()


def clean_entry_text(text):
    """Remove leading bullet characters and surrounding whitespace.

    Only a run of U+2022 (bullet) / U+25AA (small black square) at the very
    start of the trimmed text is removed; bullets elsewhere are kept.
    """
    trimmed = text.strip()
    without_bullets = re.sub(r'^[\u2022\u25AA]+', '', trimmed)
    # Strip again: whitespace usually separates the bullet from the content.
    return without_bullets.strip()


def _render_syndromes_html(items):
    """Render syndrome lines as one HTML fragment.

    Lines without a full-width colon are emitted as ``<h4>`` headings; lines
    containing one are emitted as text followed by ``<br>``. A single
    trailing ``<br>`` is removed from the result.
    """
    fragments = []
    for item in items:
        if "：" not in item:
            fragments.append(f"<h4>{item}</h4>")
        else:
            fragments.append(item + "<br>")
    html = "".join(fragments)
    # Remove the separator as a suffix. str.rstrip("<br>") treats its argument
    # as a character set, which would also eat the '>' of a trailing "</h4>"
    # and any trailing 'b' / 'r' characters of the content itself.
    if html.endswith("<br>"):
        html = html[:-len("<br>")]
    return html


def process_document(file_path):
    """Parse a .docx file into a list of disease entries.

    Paragraphs are classified by their Word style name:

    * ``Normal``     - starts a new entry; the text is the (numbered) name.
    * ``No Spacing`` - the first one after a name is stored under "歌诀";
      every following one is stored, bullet-stripped, in the "证型" list.

    Paragraphs with other styles, blank paragraphs, and ``No Spacing``
    paragraphs that appear before any name are ignored.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        A list of dicts with the keys "病名", "歌诀", "证型" (list of str)
        and "zx" (the "证型" lines rendered as an HTML fragment).
    """
    doc = Document(file_path)
    entries = []
    current_entry = None

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            # A blank 'Normal' paragraph would otherwise start a phantom entry
            # with an empty name and cut the real entry off from its content.
            continue

        style_name = paragraph.style.name
        if style_name == 'Normal':
            # A new name begins: flush the entry collected so far.
            if current_entry:
                entries.append(current_entry)
            current_entry = {
                "病名": extract_disease_name(text),
                "歌诀": "",
                "证型": []
            }
        elif style_name == 'No Spacing' and current_entry:
            if not current_entry["歌诀"]:
                # The first 'No Spacing' paragraph of an entry is the rhyme.
                current_entry["歌诀"] = text
            else:
                # The remaining ones are syndrome lines (bullets removed).
                cleaned_text = clean_entry_text(text)
                if cleaned_text:
                    current_entry["证型"].append(cleaned_text)

    # Flush the last entry.
    if current_entry:
        entries.append(current_entry)

    for entry in entries:
        entry['zx'] = _render_syndromes_html(entry['证型'])

    return entries


def save_to_json(entries, output_path):
    """Write ``entries`` to ``output_path`` as UTF-8 JSON.

    Non-ASCII characters are written verbatim (no ``\\uXXXX`` escapes) and the
    output is indented by four spaces.
    """
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(entries, json_file, indent=4, ensure_ascii=False)


def save_csv(entries, output_path):
    """Write one CSV row per entry: name ("病名"), rhyme ("歌诀"), HTML ("zx").

    No header row is written. The file is UTF-8 encoded and opened with
    ``newline=''`` so the csv module controls the line endings.
    """
    with open(output_path, 'w', newline='', encoding='utf8') as csv_handle:
        # The generator is consumed lazily, so rows are written one by one
        # in entry order.
        csv.writer(csv_handle).writerows(
            [entry['病名'], entry['歌诀'], entry['zx']] for entry in entries
        )

if __name__ == "__main__":
    # Source .docx document; this is a machine-specific absolute path.
    input_file = r'E:\BaiduNetdiskWorkspace\1701864\123同步文件夹\确有专长笔记\中医内科学\实践技能-歌诀.docx'

    # Output targets: the JSON dump and the CSV export.
    output_file = "output.json"
    output_path = "output1.csv"

    try:
        entries = process_document(input_file)
        print(f"共提取到 {len(entries)} 个病名条目")
        # Write the JSON file first, then the CSV file.
        for save, target in ((save_to_json, output_file), (save_csv, output_path)):
            save(entries, target)
        print(f"结果已保存到 {output_file}")
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"错误：{str(e)}")